
# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************

[-
# Element-type dependent settings shared with the inline code further down.
# $type is supplied from outside this file: 'h' selects the 16-bit variant,
# any other value the 32-bit one.
our $type;

# One row of settings per element width.
my %settings_for = (
    half => {
        dtype       => 'U16',
        dshift      => '1',
        dsize       => '2',
        vsize       => '64',
        convert_in  => 'F2F.F32.F16',
        convert_out => 'F2F.F16.F32',
    },
    single => {
        dtype       => '32',
        dshift      => '2',
        dsize       => '4',
        vsize       => '128',
        convert_in  => '',
        convert_out => '',
    },
);
my $chosen = $settings_for{ $type eq 'h' ? 'half' : 'single' };

our $dtype  = $chosen->{dtype};
our $dshift = $chosen->{dshift};
our $dsize  = $chosen->{dsize};
our $vsize  = $chosen->{vsize};

# Accessors so the assembly text can splice these values in as calls.
sub dtype  { return $dtype;  }
sub dsize  { return $dsize;  }
sub dshift { return $dshift; }

# Conversion mnemonics; empty strings when no conversion is needed.
# Keep a plain scalar assignment last so the block yields a defined value.
our $convert_in  = $chosen->{convert_in};
our $convert_out = $chosen->{convert_out};
-]

<CONSTANT_MAPPING>
    // Names for the kernel parameters.  Each entry is one 32-bit word of
    // constant bank 0; the words are consecutive from 0x140 to 0x1f8.

    // Two-word entries.  param_I and param_F are added into the
    // track0/track1 register pair with a carry between the two words, so
    // index 0 is the low word of an address and index 1 the high word.
    // S, X and O are not referenced in this part of the file; presumably
    // they are laid out the same way - TODO confirm.
    param_S[0]         : c[0x0][0x140]
    param_S[1]         : c[0x0][0x144]
    param_X[0]         : c[0x0][0x148]
    param_X[1]         : c[0x0][0x14c]
    param_O[0]         : c[0x0][0x150]
    param_O[1]         : c[0x0][0x154]
    param_I[0]         : c[0x0][0x158]
    param_I[1]         : c[0x0][0x15c]
    param_F[0]         : c[0x0][0x160]
    param_F[1]         : c[0x0][0x164]

    // Scalars that are not referenced in this part of the file.
    param_alpha        : c[0x0][0x168]
    param_beta         : c[0x0][0x16c]
    param_flags        : c[0x0][0x170]

    // Sizes, padding and strides.  param_C seeds the channel loop counter,
    // param_H bounds the image row index, the pads are subtracted from the
    // output coordinates, and param_WN / param_HWN are the row and channel
    // strides used when forming the image offset.
    param_C            : c[0x0][0x174]
    param_H            : c[0x0][0x178]
    param_P            : c[0x0][0x17c]
    param_pad_h        : c[0x0][0x180]
    param_pad_w        : c[0x0][0x184]
    param_HWN          : c[0x0][0x188]
    param_WN           : c[0x0][0x18c]
    param_PQN          : c[0x0][0x190]
    param_QN           : c[0x0][0x194]

    // Divisors used to split the x block id into P, Q, n and k parts, each
    // with the multiplier/shift pair that replaces the division.
    param_Qnk          : c[0x0][0x198]
    param_nk           : c[0x0][0x19c]
    param_n            : c[0x0][0x1a0]
    param_k            : c[0x0][0x1a4]
    param_magic_Qnk    : c[0x0][0x1a8]
    param_shift_Qnk    : c[0x0][0x1ac]
    param_magic_nk     : c[0x0][0x1b0]
    param_shift_nk     : c[0x0][0x1b4]
    param_magic_k      : c[0x0][0x1b8]
    param_shift_k      : c[0x0][0x1bc]

    // param_RSK is the per-channel filter stride; param_4HWNp is added to the
    // image pointer once per pass of the image loop.  param_4RSKp is
    // presumably the filter-side equivalent - TODO confirm.
    param_RSK          : c[0x0][0x1c0]
    param_4RSKp        : c[0x0][0x1c4]
    param_4HWNp        : c[0x0][0x1c8]

    // Grid extents.  param_gridP2 and param_gridQ drive the block id
    // remapping; the others are not referenced in this part of the file.
    param_gridK        : c[0x0][0x1cc]
    param_gridP2       : c[0x0][0x1d0]
    param_gridQ        : c[0x0][0x1d4]
    param_gridN        : c[0x0][0x1d8]
    param_gridQN       : c[0x0][0x1dc]
    param_gridPQN      : c[0x0][0x1e0]

    // Super-block layout: superP/superQ are bit-field descriptors applied to
    // tid, superN is a mask applied to tid, and the shift values scale the
    // block coordinates.
    param_superP       : c[0x0][0x1e4]
    param_superQ       : c[0x0][0x1e8]
    param_superN       : c[0x0][0x1ec]
    param_shiftP       : c[0x0][0x1f0]
    param_shiftQ       : c[0x0][0x1f4]
    param_shiftN       : c[0x0][0x1f8]
</CONSTANT_MAPPING>

<REGISTER_MAPPING>

    // Symbolic names for the registers.  Many ranges below name the same
    // register numbers more than once; which name is live depends on the
    // part of the kernel that is running.

    // Registers 0-63 under a flat name, used to clear them four at a time
    // with 128-bit shared loads during setup.
       0-63 : czero<00-63>

    // The same 64 registers as the 8x8 grid of FFMA accumulators.
     3, 2,11,10,19,18,27,26 : cx<0-7>y0
     7, 6,15,14,23,22,31,30 : cx<0-7>y1
     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
     5, 4,13,12,21,20,29,28 : cx<0-7>y3
    35,34,43,42,51,50,59,58 : cx<0-7>y4
    39,38,47,46,55,54,63,62 : cx<0-7>y5
    33,32,41,40,49,48,57,56 : cx<0-7>y6
    37,36,45,44,53,52,61,60 : cx<0-7>y7

    // Two sets (j0, j1) of FFMA operands: eight image and eight filter
    // values each, filled by 128-bit shared loads.
      64-79 : j0Ix<0-7>, j0Fy<0-7>
      80-95 : j1Ix<0-7>, j1Fy<0-7>

    // Temporaries for the index and address setup.
      64-79 ~ tid, idx_P, idx_Q, idx_N, idx_K, idx_n, idx_k, tid16, tid31, c, addr_zero, partialC
     80-119 ~ tid1, idx_PQnk, idx_Qnk, idx_nk, magic_Qnk, neg_Qnk, neg_nk, neg_k, div<1-3>, idx_P2, idx_Q2, z<1-2>, negOne, super_P, super_Q
      80-95 ~ super_N, y, x, ti, ti_sign, x<1-3>, mask_x, preds1, offsetIC
      80-95 ~ tf, tid31_4, offsetFC

    // Global pointer pair and the values that stay live through the loops.
    120-121 : track<0-1>
    122-127 ~ writeS, readFs, readIs, C, preds, idx_nkpq

    // Names not referenced in this part of the file; presumably they belong
    // to the output stage after the loops - TODO confirm.
      80-95 ~ p, q, n, tid32, tid64, tid_16, tid_1, q2, p2, to, superP, superQ, superN
      96-99 : Out<0-1>, Sum<0-1>
    100-121 ~ alpha, one, writeCs, readCs, k, PQN15, tid_31, out_offset, bsum_offset

      64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1

      64-79 : m0<0-3>, m1<0-3>, m2<0-3>, m3<0-3>
      80-95 : t0<0-1>, t1<0-1>, t2<0-1>, t3<0-1>

    3,2,11,10,19,18,27,26,1,0,9,8,17,16,25,24 ~ b<00|01|10|11>, x<00|01|10|11>, sum<0|1>, s0<0-1>, s1<0-1>

        // Image registers (registers assigned to avoid bank conflicts)
        // iRC is the loaded 4x4 tile; TIrc and Irc name the intermediate and
        // final sums the image loop forms from it before storing to shared
        // memory.  Several of those names share one register.
         96 = i00
         97 = i01
         98 = i02
         99 = i03
        100 = i30
        101 = i31
        102 = i32
        103 = i33
        105 = i13
        104 = i12
        107 = i11
        106 = i10
        108 = i23, TI23, I23
        109 = i22, TI22
        110 = i21, TI21
        111 = i20, TI20, I20
        113 = TI00, I00, TI10, I10, I21, I01
        112 = TI01, I11
        115 = TI02, I12
        114 = TI03, I03, TI11, I31
        116 = TI30, I30, TI12, I32
        117 = TI31
        118 = TI32
        119 = TI33, I33, TI13, I13, I22, I02
    // Filter registers
[+
    # Two alternative layouts, chosen by $FX: four 4-register vectors F0..F3,
    # or individually placed registers for the 3x3 filter values (f..), their
    # intermediates (ta.., tb.., TF..) and the values stored to shared (F..).
    our $FX;
    return $FX ? q{
    104-119 : F0<0-3>, F1<0-3>, F2<0-3>, F3<0-3>
    } : q{
         96 = f00, TF00, F00
         97 = f01, TF01
         98 = f02, TF02, F03
         99 = f10
        100 = f11
        101 = f12
        102 = f20, TF30, F30
        103 = f21, TF31
        104 = f22, TF32, F33
        105 = tb3, F32
        106 = tb0, F02
        107 = ta2, TF22, F23
        108 = ta0, TF20, F20
        109 = ta1, TF21
        110 = F01
        111 = F31
        112 = TF10, F10
        113 = TF11
        114 = TF12, F13
        115 = tb1, F12
        116 = tb2, F22
        117 = F11
        118 = F21
    };
+]
</REGISTER_MAPPING>

// Kernel entry.  Read the thread id and the x block id; the block id is split
// below into P, Q, n and k block coordinates.
--:-:1:-:1      S2R tid,      SR_TID.X;
--:-:2:-:1      S2R idx_PQnk, SR_CTAID.X;
<SCHEDULE_BLOCK>
// P0 = tid >= 128.  After this block the P0 threads branch to FILTER_SETUP
// and the remaining threads fall through to the image setup.
01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 128, PT;

--:-:-:-:1      LOP.AND tid31, tid, 31;

// c = (tid & 127) / 32
--:-:-:-:1      BFE.U32 c, tid, 0x205; // 2 bits at position 5

// addr_zero = tid31*16 + c*2048, plus a fixed offset for the P0 threads.
--:-:-:-:1      SHL addr_zero, tid31, 4;
--:-:-:-:1      ISCADD addr_zero, c, addr_zero, 11;
--:-:-:-:1  @P0 IADD addr_zero, addr_zero, 4x<512*4>;

// Store zeros to four 128-bit slots relative to addr_zero.
--:-:-:-:1      STS.128 [addr_zero + 4x<00*4>], RZ;
--:-:-:-:1      STS.128 [addr_zero + 4x<32*4>], RZ;
--:-:-:-:1      STS.128 [addr_zero + 4x<64*4>], RZ;
--:-:-:-:1      STS.128 [addr_zero + 4x<96*4>], RZ;

// Generate sixteen 128-bit loads from addr_zero, one per group of four czero
// registers (czero00, czero04, ... czero60), which clears registers 0-63.
[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]


// idx_P2 = idx_PQnk / blk_Qnk
// When the magic multiplier is 1 only the shift is applied.
--:-:-:-:1      MOV  magic_Qnk, param_magic_Qnk;
--:-:-:-:1      ISETP.NE.AND P1, PT,  magic_Qnk, 1, PT;
02:-:-:-:1  @P1 XMAD     div1, idx_PQnk,    magic_Qnk,    RZ;
--:-:-:-:1  @P1 XMAD     div2, idx_PQnk,    magic_Qnk.H1, RZ;
--:-:-:-:1  @P1 XMAD     div3, idx_PQnk.H1, magic_Qnk.H1, RZ;
--:-:-:-:1  @P1 XMAD.CHI div1, idx_PQnk.H1, magic_Qnk,    div1;
--:-:-:-:1  @P1 IADD3.RS idx_P2, div1, div2, div3;
--:-:-:-:1  @P1 SHR.U32  idx_P2, idx_P2,   param_shift_Qnk;
--:-:-:-:1 @!P1 SHR.U32  idx_P2, idx_PQnk, param_shift_Qnk;

// idx_Qnk = idx_PQnk % blk_Qnk
// Each remainder is formed as value - quotient * divisor, by multiplying the
// quotient with the negated divisor and adding the value.
--:-:-:-:1      IADD neg_Qnk, RZ, -param_Qnk;
--:-:-:-:1      XMAD.LO2 idx_Qnk, neg_Qnk, idx_P2, idx_PQnk;

// idx_Q2  = idx_Qnk / nk
--:-:-:-:1      XMAD.LO2C idx_Q2, idx_Qnk, param_magic_nk, RZ;
--:-:-:-:1      SHR.U32   idx_Q2, idx_Q2,  param_shift_nk;
// idx_nk = idx_Qnk % nk
--:-:-:-:1      IADD neg_nk, RZ, -param_nk;
--:-:-:-:1      XMAD.S16.U16  idx_nk, neg_nk, idx_Q2, idx_Qnk;

// idx_n = idx_nk / k
--:-:-:-:1      XMAD    idx_n,  idx_nk, param_magic_k, RZ;
--:-:-:-:1      SHR.U32 idx_n,  idx_n,  param_shift_k;
// idx_k = idx_nk % k
--:-:-:-:1      IADD neg_k, RZ, -param_k;
--:-:-:-:1      XMAD.S16.U16 idx_k, neg_k, idx_n, idx_nk;

// Implement a square wave block id remapping (for all but last row (if odd number of rows))
// idx_P = idx_P2 * 2
// idx_Q = idx_Q2
// if idx_P2 != gridP2:
//     idx_P += (idx_Q2 & 1) ^ ((idx_Q2 & 2)>>1)
//     idx_Q  = idx_Q2 >> 1
--:-:-:-:1      ISETP.NE.AND P1, PT, idx_P2, param_gridP2, PT;
--:-:-:-:1      SHL idx_P, idx_P2, 1;
--:-:-:-:1  @P1 LOP.AND z1, idx_Q2, 1;
--:-:-:-:1  @P1 BFE.U32 z2, idx_Q2, 0x101; // 1 bit at position 1
--:-:-:-:1  @P1 LOP.XOR z1, z1, z2;
--:-:-:-:1  @P1 IADD idx_P, idx_P, z1;
--:-:-:-:1  @P1 SHR.U32 idx_Q, idx_Q2, 1;
--:-:-:-:1 @!P1 MOV idx_Q, idx_Q2;

// Scan backwards on odd rows
// if idx_P2 & 1:
//     idx_Q = gridQ - idx_Q - 1
--:-:-:-:1      LOP.AND.NZ P2, RZ, idx_P2, 1;
--:-:-:-:1      MOV negOne, -1;
--:-:-:-:1  @P2 IADD3 idx_Q, -idx_Q, param_gridQ, negOne;

// Pack the block coordinates into idx_nkpq: idx_Q as the base value, then
// idx_P, idx_k and idx_n inserted as bit fields.  Assuming the descriptors
// use the same length/position encoding as the BFE comments above, that is
// idx_P at bit 12 (12 bits), idx_k at bit 24 and idx_n at bit 28 (4 bits
// each) - TODO confirm.
--:-:-:-:1      BFI idx_nkpq, idx_P, 0x0c0c, idx_Q;
--:-:-:-:1      BFI idx_nkpq, idx_k, 0x0418, idx_nkpq;
--:-:-:-:1      BFI idx_nkpq, idx_n, 0x041c, idx_nkpq;

// x = grid_x << shiftX
// y = grid_y << shiftY
--:-:-:-:1      SHL idx_P, idx_P, param_shiftP;
--:-:-:-:1      SHL idx_Q, idx_Q, param_shiftQ;

// Distribute the 8|4|2|1 blocks of the super block among 4|8|16|32 threads each of the warp
// idx_P = idx_P*2 + super_P and idx_Q = idx_Q*2 + super_Q, where super_P and
// super_Q are bit fields taken from tid.
--:-:-:-:1      BFE.U32 super_P, tid, param_superP;
--:-:-:-:1      BFE.U32 super_Q, tid, param_superQ;
--:-:-:-:1      ISCADD idx_P, super_P,  idx_P, 1;
--:-:-:-:1      ISCADD idx_Q, super_Q,  idx_Q, 1;

// If this value is not a multiple of 4 we want to grab the partial amount on the first fetch.
// If it is a multiple of 4 then make a full 4 line fetch.
// partialC = C & 3, or 4 when that is zero; otherwise C is rounded up to the
// next multiple of 4.
--:-:-:-:1      MOV C, param_C;
--:-:-:-:1      LOP.AND.Z P6, partialC, C, 3;
--:-:-:-:1 @!P6 IADD3 C, C, 4, -partialC;
--:-:-:-:1  @P6 MOV partialC, 4;
// P6 = c < partialC
--:-:-:-:1      ISETP.LT.AND P6, PT, c, partialC, PT;

// When $FX is not set, writeS is computed here for every thread.  When it is
// set, the image and filter setup paths each compute their own writeS.
[+
    our $FX; return $FX ? '' : q{
// writeS = c*512 + tid & 31
--:-:-:-:1      ISCADD writeS, c, tid31, 9;
--:-:-:-:1      ISCADD writeS, writeS, 4x<512*4*2>, 2;
    }
+]

// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3)
// readFs = ((tid & -16) >> 1) | ((tid &  8) >> 2) | (tid & 1)
// Both are then scaled by 16; readFs also gets a fixed offset added.
--:-:-:-:1      LOP.AND  tid16,  tid,   -16;
--:-:-:-:1      SHR.U32  tid16,  tid16,  1;

--:-:-:-:1      BFE.U32  readIs, tid,    0x201; // 2 bits at position 1
--:-:-:-:1      LOP.OR   readIs, readIs, tid16;
--:-:-:-:1      SHL      readIs, readIs, 4;

--:-:-:-:1      LOP.AND  tid1,   tid,    1;
--:-:-:-:1      LOP.AND  readFs, tid,    8;
--:-:-:-:1      SHR.U32  readFs, readFs, 2;
--:-:-:-:1      LOP3.LUT readFs, readFs, tid16, tid1, 0xfe;
--:-:-:-:1      ISCADD   readFs, readFs, 4x<512*4>, 4;
</SCHEDULE_BLOCK>

// Threads with tid >= 128 (P0) go to the filter setup.  The remaining threads
// continue here: they compute the image address and bounds predicates, issue
// the first loads of the 4x4 image tile and then enter IMAGE_LOOP.
--:-:-:-:5  @P0 BRA.U FILTER_SETUP;

--:-:1:-:2      S2R idx_N, SR_CTAID.Z;


<SCHEDULE_BLOCK>

// writeS = (c*512 + (tid & 31)) * 4 plus a fixed offset.
// Only emitted here when $FX is set; otherwise it was computed before the branch.
[+
    our $FX;
    return $FX ? q{
--:-:-:-:1      ISCADD writeS, c, tid31, 9;
--:-:-:-:1      ISCADD writeS, writeS, 4x<512*4*2>, 2;
    } : '';
+]

--:-:-:-:1      LOP.AND super_N, tid, param_superN;

// idx_N = ((ctaid.z * param_n + idx_n) << shiftN) + super_N
01:-:-:-:1      XMAD idx_N, idx_N, param_n, idx_n;
--:-:-:-:1      SHL  idx_N, idx_N, param_shiftN;
--:-:-:-:1      IADD idx_N, idx_N, super_N;

// n < N
--:-:-:-:1      ISETP.LT.AND P5, PT, idx_N, 1x<$N>, PT;

// Subtract off the padding
--:-:-:-:1      IADD y, idx_P, -param_pad_h;
--:-:-:-:1      IADD x, idx_Q, -param_pad_w;

// a0 = n + x*N + y*XN + c*YXN
// track0/track1 = param_I + (ti << dshift).  ti_sign is set from ti < 0 and
// presumably supplies the sign extension for the high word - TODO confirm.
--:-:-:-:1      XMAD.S16.U16      ti, x,  1x<$N>,    idx_N;
--:-:-:-:1      XMAD.S16.U16.LO2C ti, y,  param_WN,  ti;
--:-:-:-:1      XMAD.S16.U16.LO2C ti, c,  param_HWN, ti;
--:-:-:-:1      ISET.LT.AND ti_sign, ti, RZ, PT;
--:-:-:-:1      LEA    track0.CC, ti,      param_I[0], [+ dshift() +];
--:-:-:-:1      IADD.X track1,    ti_sign, param_I[1];

--:-:-:-:1      IADD x1, x, 1;
--:-:-:-:1      IADD x2, x, 2;
--:-:-:-:1      IADD x3, x, 3;

// P0..P3 = (0 <= x+i < W) for i = 0..3, saved as the low four bits of mask_x.
--:-:-:-:1      ISETP.LT.AND P0, PT, x,  1x<$W>, PT;
--:-:-:-:1      ISETP.LT.AND P1, PT, x1, 1x<$W>, PT;
--:-:-:-:1      ISETP.LT.AND P2, PT, x2, 1x<$W>, PT;
--:-:-:-:1      ISETP.LT.AND P3, PT, x3, 1x<$W>, PT;
--:-:-:-:1      ISETP.GE.AND P0, PT, x,  RZ, P0;
--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
--:-:-:-:1      ISETP.GE.AND P2, PT, x2, RZ, P2;
--:-:-:-:1      ISETP.GE.AND P3, PT, x3, RZ, P3;
--:-:-:-:1      P2R mask_x, PR, RZ, 0x0f;

// x1..x3 are reused for y+1..y+3.
// P0..P3 = (0 <= y+i < H) and (n < N) for i = 0..3.
--:-:-:-:1      IADD x1, y, 1;
--:-:-:-:1      IADD x2, y, 2;
--:-:-:-:1      IADD x3, y, 3;
--:-:-:-:1      ISETP.LT.AND P0, PT, y,  param_H, P5;
--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_H, P5;
--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_H, P5;
--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_H, P5;
--:-:-:-:1      ISETP.GE.AND P0, PT, y,  RZ, P0;
--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
--:-:-:-:1      ISETP.GE.AND P2, PT, x2, RZ, P2;
--:-:-:-:1      ISETP.GE.AND P3, PT, x3, RZ, P3;

// preds holds one 4-bit group per tile row: mask_x where the row is in range,
// zero otherwise.
--:-:-:-:1      SEL preds, mask_x, RZ, P0;
--:-:-:-:1  @P1 BFI preds, mask_x, 0x404, preds;
--:-:-:-:1  @P2 BFI preds, mask_x, 0x408, preds;
--:-:-:-:1  @P3 BFI preds, mask_x, 0x40c, preds;

// For partial C on first load
--:-:-:-:1      SEL preds1, preds, RZ, P6;

// offsetIC = partialC*YXN
--:-:-:-:1      XMAD.LO2C offsetIC, partialC, param_HWN, RZ;

// Tile row 0.  Each R2P loads the low four bits of preds1 into P0..P3; the
// shift that follows moves the next row's bits into the low position
// (presumably a rotate, since the same register is given for both halves).
// Rows are loaded in the order 0, 3, 1, 2.  Elements whose predicate is clear
// are set to zero instead of loaded.
--:-:-:-:1      R2P PR, preds1, 0x0f;
--:-:-:-:1      SHF.R.U64 preds1, preds1, 12, preds1;

--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i00, [track + [+ dsize() +]x<0*$W*$N + 0*$N>];
--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i01, [track + [+ dsize() +]x<0*$W*$N + 1*$N>];
--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i02, [track + [+ dsize() +]x<0*$W*$N + 2*$N>];
--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i03, [track + [+ dsize() +]x<0*$W*$N + 3*$N>];
--:-:-:-:1 @!P0 MOV i00, RZ;
--:-:-:-:1 @!P1 MOV i01, RZ;
--:-:-:-:1 @!P2 MOV i02, RZ;
--:-:-:-:1 @!P3 MOV i03, RZ;

// Tile row 3.
--:-:-:-:1      R2P PR, preds1, 0x0f;
--:-:-:-:1      SHF.L.U64 preds1, preds1, 8, preds1;

--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i30, [track + [+ dsize() +]x<3*$W*$N + 0*$N>];
--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i31, [track + [+ dsize() +]x<3*$W*$N + 1*$N>];
--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i32, [track + [+ dsize() +]x<3*$W*$N + 2*$N>];
--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i33, [track + [+ dsize() +]x<3*$W*$N + 3*$N>];
--:-:-:-:1 @!P0 MOV i30, RZ;
--:-:-:-:1 @!P1 MOV i31, RZ;
--:-:-:-:1 @!P2 MOV i32, RZ;
--:-:-:-:1 @!P3 MOV i33, RZ;

// Tile row 1.
--:-:-:-:1      R2P PR, preds1, 0x0f;
--:-:-:-:1      SHF.R.U64 preds1, preds1, 4, preds1;

--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i10, [track + [+ dsize() +]x<1*$W*$N + 0*$N>];
--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i11, [track + [+ dsize() +]x<1*$W*$N + 1*$N>];
--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i12, [track + [+ dsize() +]x<1*$W*$N + 2*$N>];
--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i13, [track + [+ dsize() +]x<1*$W*$N + 3*$N>];
--:-:-:-:1 @!P0 MOV i10, RZ;
--:-:-:-:1 @!P1 MOV i11, RZ;
--:-:-:-:1 @!P2 MOV i12, RZ;
--:-:-:-:1 @!P3 MOV i13, RZ;

// Tile row 2.
--:-:-:-:1      R2P PR, preds1, 0x0f;

--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i20, [track + [+ dsize() +]x<2*$W*$N + 0*$N>];
--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i21, [track + [+ dsize() +]x<2*$W*$N + 1*$N>];
--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i22, [track + [+ dsize() +]x<2*$W*$N + 2*$N>];
--:6:2:-:1  @P3 LDG.E.CI.[+ dtype() +] i23, [track + [+ dsize() +]x<2*$W*$N + 3*$N>];
--:-:-:-:1 @!P0 MOV i20, RZ;
--:-:-:-:1 @!P1 MOV i21, RZ;
--:-:-:-:1 @!P2 MOV i22, RZ;
--:-:-:-:1 @!P3 MOV i23, RZ;
</SCHEDULE_BLOCK>

--:-:-:-:5      BAR.SYNC 0;

// Fetch the first set of FFMA operands from shared memory.
--:-:-:-:1      LDS.U.128 j0Ix0, [readIs + 4x<0*512 + 00>];
--:-:-:-:1      LDS.U.128 j0Fy0, [readFs + 4x<0*512 + 00>];
--:-:-:-:1      LDS.U.128 j0Ix4, [readIs + 4x<0*512 + 16>];
--:-:1:-:1      LDS.U.128 j0Fy4, [readFs + 4x<0*512 + 16>];

// Advance the image pointer by offsetIC elements, i.e. past the channels
// covered by the first (possibly partial) fetch.
20:-:-:-:6      LEA      track0.CC, offsetIC, track0,     [+ dshift() +];
--:-:-:-:0      LEA.HI.X track1,    offsetIC, track1, RZ, [+ dshift() +];

--:-:-:-:5      BRA.U IMAGE_LOOP;



// Filter-side setup, reached by the threads with tid >= 128.  Computes the
// filter address, issues the first filter loads and then enters FILTER_LOOP.
FILTER_SETUP:

--:-:1:-:2      S2R idx_K, SR_CTAID.Y;

<SCHEDULE_BLOCK>
// idx_K = ctaid.y * param_k + idx_k
01:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;

[+
    # Two variants, chosen by $FX.
    # Set:     the filter is read with four vector loads F0..F3 per thread from
    #          a layout described in the string as (kBlks,C,4,4,32); threads
    #          with P6 clear read zeros from shared memory instead.
    # Not set: each thread reads the nine values f00..f22 of one filter
    #          k = idx_K*32 + (tid & 31), zero filled when P0 is clear
    #          (k out of range or c not below partialC).
    # The strings are interpolated, so '@' in predicates is escaped.
    our ($dtype, $dshift, $FX, $K, $vsize, $dsize);
    return $FX ? qq{

// writeS = (c*512 + (tid & 31)*4)*4
--:-:-:-:1      SHL writeS, tid31, 4;
--:-:-:-:1      ISCADD writeS, c, writeS, 11;
--:-:-:-:1      LOP.XOR writeS, writeS, 4x<512*4*2>;

// (kBlks,C,4,4,32)
// offset = idx_K*C*512 + c*512 + tid31*4;
--:-:-:-:1      SHL    tid31_4,  tid31, 2;
--:-:-:-:1      XMAD   tf, idx_K, param_C, c;
--:-:-:-:1      ISCADD tf, tf, tid31_4, 9;
--:-:-:-:1      LEA      track0.CC, tf, param_F[0],     $dshift;
--:-:-:-:1      LEA.HI.X track1,    tf, param_F[1], RZ, $dshift;

// offsetFC = partialC*512
--:-:-:-:1      SHL  offsetFC, partialC, 9;

--:-:-:-:1 \@!P6 LDS.U.$vsize F0, [addr_zero];
--:-:-:-:1 \@!P6 LDS.U.$vsize F1, [addr_zero];
--:-:-:-:1 \@!P6 LDS.U.$vsize F2, [addr_zero];
--:-:-:-:1 \@!P6 LDS.U.$vsize F3, [addr_zero];
<ORDERED>
--:-:2:-:1  \@P6 LDG.E.CG.$vsize F0, [track + 4x<00 * $dsize>];
--:-:3:-:1  \@P6 LDG.E.CG.$vsize F1, [track + 4x<32 * $dsize>];
--:-:4:-:1  \@P6 LDG.E.CG.$vsize F2, [track + 4x<64 * $dsize>];
--:6:5:-:1  \@P6 LDG.E.CG.$vsize F3, [track + 4x<96 * $dsize>];
</ORDERED>

    } : qq{
// k = idx_K*32 + tid & 31
--:-:-:-:1      ISCADD  idx_K, idx_K, tid31,  5;
--:-:-:-:1      ISETP.LT.AND P0, PT, idx_K, 1x<$K>, P6;
--:-:-:-:1      ISETP.LT.AND P1, PT, idx_K, 1x<$K>, PT;

// offsetFC = partialC * RSK
--:-:-:-:1      XMAD.LO2C offsetFC, partialC, param_RSK, RZ;

// a0 = k + c*RSK
--:-:-:-:1      XMAD.LO2C tf, c, param_RSK, idx_K;

--:-:-:-:1      LEA      track0.CC, tf, param_F[0],     $dshift;
--:-:-:-:1      LEA.HI.X track1,    tf, param_F[1], RZ, $dshift;

--:-:-:-:1 \@!P0 MOV f00, RZ;
--:-:-:-:1 \@!P0 MOV f01, RZ;
--:-:-:-:1 \@!P0 MOV f02, RZ;
--:-:-:-:1  \@P0 LDG.E.CI.$dtype f00, [track + ${dsize}x<0*3*$K + 0*$K>];
--:-:-:-:1  \@P0 LDG.E.CI.$dtype f01, [track + ${dsize}x<0*3*$K + 1*$K>];
--:-:-:-:1  \@P0 LDG.E.CI.$dtype f02, [track + ${dsize}x<0*3*$K + 2*$K>];
--:-:-:-:1 \@!P0 MOV f20, RZ;
--:-:-:-:1 \@!P0 MOV f21, RZ;
--:-:-:-:1 \@!P0 MOV f22, RZ;
--:-:-:-:1  \@P0 LDG.E.CI.$dtype f20, [track + ${dsize}x<2*3*$K + 0*$K>];
--:-:-:-:1  \@P0 LDG.E.CI.$dtype f21, [track + ${dsize}x<2*3*$K + 1*$K>];
--:-:-:-:1  \@P0 LDG.E.CI.$dtype f22, [track + ${dsize}x<2*3*$K + 2*$K>];
--:-:-:-:1 \@!P0 MOV f10, RZ;
--:-:-:-:1 \@!P0 MOV f11, RZ;
--:-:-:-:1 \@!P0 MOV f12, RZ;
--:-:-:-:1  \@P0 LDG.E.CI.$dtype f10, [track + ${dsize}x<1*3*$K + 0*$K>];
--:-:-:-:1  \@P0 LDG.E.CI.$dtype f11, [track + ${dsize}x<1*3*$K + 1*$K>];
--:6:2:-:1  \@P0 LDG.E.CI.$dtype f12, [track + ${dsize}x<1*3*$K + 2*$K>];
    };
+]
</SCHEDULE_BLOCK>

--:-:-:-:5      BAR.SYNC 0;

// Fetch the first set of FFMA operands from shared memory.
--:-:-:-:1      LDS.U.128 j0Ix0, [readIs + 4x<0*512 + 00>];
--:-:-:-:1      LDS.U.128 j0Fy0, [readFs + 4x<0*512 + 00>];
--:-:-:-:1      LDS.U.128 j0Ix4, [readIs + 4x<0*512 + 16>];
--:-:1:-:1      LDS.U.128 j0Fy4, [readFs + 4x<0*512 + 16>];

// Advance the filter pointer by offsetFC elements, i.e. past the channels
// covered by the first (possibly partial) fetch.
20:-:-:-:6      LEA      track0.CC, offsetFC, track0,     [+ dshift() +];
--:-:-:-:0      LEA.HI.X track1,    offsetFC, track1, RZ, [+ dshift() +];

--:-:-:-:5      BRA.U FILTER_LOOP;


IMAGE_LOOP:
--:-:-:-:1      ISETP.GT.AND P6, PT, C, 4, PT;
[+
    # Generates the body of the image-side main loop: four unrolled steps of
    # 64 FFMAs each.  The image loads, the FADD combinations of the loaded
    # 4x4 tile, the shared stores and the loop control are spliced in between
    # the FFMAs at fixed positions.
    our ($dtype, $dsize, $convert_in, $W, $N);

    # Key "jJcC" holds the text emitted directly after FFMA number C (0-63)
    # of unrolled step J (0-3).
    my %insert = (

        # P5 = C > 0 on entry to this pass; it gates the FADDs, the shared
        # stores and the branch back to the top.  C then drops by 4.
        j0c1  => "--:-:-:-:1      ISETP.GT.AND P5, PT, C, RZ, PT;\n" .
                 "--:-:-:-:1      IADD C, C, -4;\n",


        # Load the first group of row predicates for the next loads.  P6
        # (C > 4, set before the FFMAs) says whether another fetch follows;
        # without it preds is cleared at j0c9.
        j0c14 => "--:-:-:-:1      R2P PR, preds, 0x0f;\n",
        j0c16 => "--:-:-:-:1  \@P6 SHF.R.U64 preds, preds, 12, preds;\n",

        # With a conversion mnemonic set, every loaded element is converted
        # in place before it is used.
        $convert_in ? (
            j0c3  => "02:-:-:-:1      $convert_in i00, i00;\n",
            j0c5  => "--:-:-:-:1      $convert_in i01, i01;\n",
            j0c7  => "--:-:-:-:1      $convert_in i02, i02;\n",
            j0c9  => "--:-:-:-:0 \@!P6 MOV preds, RZ;\n" .
                     "--:-:-:-:1      $convert_in i03, i03;\n",

            j0c11 => "--:-:-:-:1      $convert_in i20, i20;\n",
            j0c13 => "--:-:-:-:1      $convert_in i21, i21;\n",
            j0c15 => "--:-:-:-:1      $convert_in i22, i22;\n",
            j0c17 => "--:-:2:-:1      $convert_in i23, i23;\n",

            j0c19 => "--:-:-:-:1      $convert_in i10, i10;\n",
            j0c21 => "--:-:-:-:1      $convert_in i11, i11;\n",
            j0c23 => "--:-:-:-:1      $convert_in i12, i12;\n",
            j0c25 => "--:-:-:-:1      $convert_in i13, i13;\n",

            j0c27 => "--:-:-:-:1      $convert_in i30, i30;\n",
            j0c29 => "--:-:-:-:1      $convert_in i31, i31;\n",
            j0c31 => "--:-:-:-:1      $convert_in i32, i32;\n",
            j0c33 => "--:-:3:-:1      $convert_in i33, i33;\n",
        ) : (
            j0c9  => "--:-:-:-:1 \@!P6 MOV preds, RZ;\n",
        ),

        # Row 0 minus row 2 of the tile, then reload row 0 for the next pass.
        # Elements whose predicate is clear are zeroed via I2F of RZ.
        j0c32 => "02:-:-:-:1  \@P5 FADD TI00, i00, -i20;\n" .
                 "--:-:-:-:1  \@P5 FADD TI01, i01, -i21;\n" .
                 "--:-:-:-:1  \@P5 FADD TI02, i02, -i22;\n" .
                 "--:-:-:-:1  \@P5 FADD TI03, i03, -i23;\n",

        j0c35 => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype i00, [track + ${dsize}x<0*$W*$N + 0*$N>];\n",
        j0c37 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype i01, [track + ${dsize}x<0*$W*$N + 1*$N>];\n",
        j0c39 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtype i02, [track + ${dsize}x<0*$W*$N + 2*$N>];\n",
        j0c41 => "--:-:-:-:1  \@P3 LDG.E.CI.$dtype i03, [track + ${dsize}x<0*$W*$N + 3*$N>];\n",
        j0c43 => "--:-:-:-:1 \@!P0 I2F.F32.U32 i00, RZ;\n",
        j0c45 => "--:-:-:-:1 \@!P1 I2F.F32.U32 i01, RZ;\n",
        j0c47 => "--:-:-:-:1 \@!P2 I2F.F32.U32 i02, RZ;\n",
        j0c49 => "--:-:-:-:1 \@!P3 I2F.F32.U32 i03, RZ;\n" .
                 "--:-:-:-:1      R2P PR, preds, 0x0f;\n",

        j0c50 => "--:-:-:-:1  \@P6 SHF.L.U64 preds, preds, 8, preds;\n",

        # Row 1 minus row 3, then reload row 3.
        j0c55 => "04:-:-:-:1  \@P5 FADD TI30, i10, -i30;\n" .
                 "--:-:-:-:1  \@P5 FADD TI31, i11, -i31;\n" .
                 "--:-:-:-:1  \@P5 FADD TI32, i12, -i32;\n" .
                 "--:-:-:-:1  \@P5 FADD TI33, i13, -i33;\n",

        j0c57 => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype i30, [track + ${dsize}x<3*$W*$N + 0*$N>];\n",
        j0c59 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype i31, [track + ${dsize}x<3*$W*$N + 1*$N>];\n",
        j0c61 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtype i32, [track + ${dsize}x<3*$W*$N + 2*$N>];\n",
        j0c63 => "--:-:-:-:1  \@P3 LDG.E.CI.$dtype i33, [track + ${dsize}x<3*$W*$N + 3*$N>];\n",
        j1c1  => "--:-:-:-:1 \@!P0 I2F.F32.U32 i30, RZ;\n",
        j1c3  => "--:-:-:-:1 \@!P1 I2F.F32.U32 i31, RZ;\n",
        j1c5  => "--:-:-:-:1 \@!P2 I2F.F32.U32 i32, RZ;\n",
        j1c7  => "--:-:-:-:1 \@!P3 I2F.F32.U32 i33, RZ;\n" .
                 "--:-:-:-:1      R2P PR, preds, 0x0f;\n" .
                 "--:-:-:-:1  \@P5 FADD I00, TI00, -TI02;\n" .
                 "--:-:-:-:1  \@P5 FADD I03, TI01, -TI03;\n" .
                 "--:-:-:-:1  \@P5 FADD I30, TI30, -TI32;\n" .
                 "--:-:-:-:1  \@P5 FADD I33, TI31, -TI33;\n" .
                 "--:-:-:-:1  \@P6 SHF.R.U64 preds, preds, 4, preds;\n",

        # Store the four corner results to shared memory.
        j1c9  => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*4 + 0)>], I00;\n",
        j1c11 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*4 + 3)>], I03;\n",
        j1c13 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(3*4 + 0)>], I30;\n",
        j1c15 => "--:3:-:-:1  \@P5 STS [writeS + 4x<32*(3*4 + 3)>], I33;\n",


        # Sum and difference of rows 1 and 2, then reload row 1.
        j1c29 => "04:-:-:-:1  \@P5 FADD TI10,  i10, i20;\n" .
                 "--:-:-:-:1  \@P5 FADD TI20, -i10, i20;\n" .
                 "--:-:-:-:1  \@P5 FADD TI11,  i11, i21;\n" .
                 "--:-:-:-:1  \@P5 FADD TI21, -i11, i21;\n" .
                 "--:-:-:-:1  \@P5 FADD TI12,  i12, i22;\n" .
                 "--:-:-:-:1  \@P5 FADD TI22, -i12, i22;\n" .
                 "--:-:-:-:1  \@P5 FADD TI13,  i13, i23;\n" .
                 "--:-:-:-:1  \@P5 FADD TI23, -i13, i23;\n",

        j1c30 => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype i10, [track + ${dsize}x<1*$W*$N + 0*$N>];\n",
        j1c32 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype i11, [track + ${dsize}x<1*$W*$N + 1*$N>];\n",
        j1c34 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtype i12, [track + ${dsize}x<1*$W*$N + 2*$N>];\n",
        j1c36 => "--:-:-:-:1  \@P3 LDG.E.CI.$dtype i13, [track + ${dsize}x<1*$W*$N + 3*$N>];\n",
        j1c38 => "--:-:-:-:1 \@!P0 I2F.F32.U32 i10, RZ;\n",
        j1c40 => "--:-:-:-:1 \@!P1 I2F.F32.U32 i11, RZ;\n",
        j1c42 => "--:-:-:-:1 \@!P2 I2F.F32.U32 i12, RZ;\n",
        j1c44 => "--:-:-:-:1 \@!P3 I2F.F32.U32 i13, RZ;\n" .
                 "--:-:-:-:1      R2P PR, preds, 0x0f;\n" .
                 "--:-:-:-:1  \@P5 FADD I10, TI10, -TI12;\n" .
                 "--:-:-:-:1  \@P5 FADD I20, TI20, -TI22;\n" .
                 "--:-:-:-:1  \@P5 FADD I13, TI11, -TI13;\n" .
                 "--:-:-:-:1  \@P5 FADD I23, TI21, -TI23;\n" .
                 "--:-:-:-:1  \@P6 SHF.L.U64 preds, preds, 8, preds;\n",

        j1c46 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*4 + 0)>], I10;\n",
        j1c48 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(2*4 + 0)>], I20;\n",
        j1c50 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*4 + 3)>], I13;\n",
        j1c52 => "--:3:-:-:1  \@P5 STS [writeS + 4x<32*(2*4 + 3)>], I23;\n",


        j2c8  => "04:-:-:-:1  \@P5 FADD I21,  TI21, TI22;\n" .
                 "--:-:-:-:1  \@P5 FADD I22, -TI21, TI22;\n",

        j2c11 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(2*4 + 1)>], I21;\n",
        j2c13 => "--:3:-:-:1  \@P5 STS [writeS + 4x<32*(2*4 + 2)>], I22;\n",

        # Reload row 2.
        j2c15 => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype i20, [track + ${dsize}x<2*$W*$N + 0*$N>];\n",
        j2c17 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype i21, [track + ${dsize}x<2*$W*$N + 1*$N>];\n",
        j2c19 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtype i22, [track + ${dsize}x<2*$W*$N + 2*$N>];\n",
        j2c21 => "--:6:2:-:1  \@P3 LDG.E.CI.$dtype i23, [track + ${dsize}x<2*$W*$N + 3*$N>];\n",
        j2c23 => "--:-:-:-:1 \@!P0 I2F.F32.U32 i20, RZ;\n",
        j2c25 => "--:-:-:-:1 \@!P1 I2F.F32.U32 i21, RZ;\n",
        j2c27 => "--:-:-:-:1 \@!P2 I2F.F32.U32 i22, RZ;\n",
        j2c29 => "--:-:-:-:1 \@!P3 I2F.F32.U32 i23, RZ;\n",

        j2c30 => "04:-:-:-:1  \@P5 FADD I01,  TI01, TI02;\n" .
                 "--:-:-:-:1  \@P5 FADD I02, -TI01, TI02;\n" .
                 "--:-:-:-:1  \@P5 FADD I11,  TI11, TI12;\n" .
                 "--:-:-:-:1  \@P5 FADD I12, -TI11, TI12;\n" .
                 "--:-:-:-:1  \@P5 FADD I31,  TI31, TI32;\n" .
                 "--:-:-:-:1  \@P5 FADD I32, -TI31, TI32;\n",

        j2c31 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*4 + 1)>], I01;\n",
        j2c33 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*4 + 2)>], I02;\n",
        j2c35 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*4 + 1)>], I11;\n",
        j2c37 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*4 + 2)>], I12;\n",
        j2c39 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(3*4 + 1)>], I31;\n",
        j2c41 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(3*4 + 2)>], I32;\n",

        # Barrier, then flip the shared read and write offsets by XOR with a
        # fixed constant (toggling between two shared regions).
        j2c62 => "--:-:-:-:5      BAR.SYNC 0;\n" .
                 "--:-:-:-:1  \@P5 LOP.XOR readIs, readIs, 4x<512*4*2>;\n" .
                 "--:-:-:-:1  \@P5 LOP.XOR readFs, readFs, 4x<512*4*2>;\n" .
                 "--:-:-:-:1  \@P5 LOP.XOR writeS, writeS, 4x<512*4*2>;\n",

        # Advance the image pointer when another fetch follows.
        j3c57 => "20:-:-:-:1  \@P6 IADD   track0.CC, track0, param_4HWNp;\n",
        j3c62 => "--:-:-:-:1  \@P6 IADD.X track1,    track1, RZ;\n",

        # Loop while P5 is set, otherwise leave for END_LOOP.
        j3c63 => "--:-:-:Y:5  \@P5 BRA.U IMAGE_LOOP;\n" .
                 "--:-:-:Y:5      BRA.U END_LOOP;\n",
    );
    # Build the order in which the 64 accumulators are visited.  For every
    # x base (0,2,4,6) and y base (0,1,4,5) the four swirl offsets add 0 or 1
    # to x and 0 or 2 to y; the y bases are walked in reverse order on every
    # other x base.
    my @cOrder;
    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
    my @y = (0,1,4,5);
    foreach my $x (0,2,4,6)
    {
        foreach my $y (@y)
        {
            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
        }
        @y = reverse @y;
    }
    my $out;
    foreach my $j (0 .. 3)
    {
        # $odd picks the operand set (j0 or j1) this step reads; $nOdd is the
        # other set, which this step refills from shared slice $rsOffset.
        # In the last step the refill is predicated on P5.
        my $odd      = $j & 1;
        my $nOdd     = !$odd + 0;
        my $rsOffset = ($j + 1) % 4;
        my $rsPred   = $j == 3 ? '@P5' : '   ';

        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIx0, [readIs + 4x<%d*512 + 00>];\n", $rsPred, $nOdd, $rsOffset;
        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dFy0, [readFs + 4x<%d*512 + 00>];\n", $rsPred, $nOdd, $rsOffset;
        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIx4, [readIs + 4x<%d*512 + 16>];\n", $rsPred, $nOdd, $rsOffset;
        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dFy4, [readFs + 4x<%d*512 + 16>];\n", $rsPred, $nOdd, $rsOffset;

        foreach my $c (0 .. 63)
        {
            my ($x,$y) = @{$cOrder[$c]};

            my $ins    = $insert{"j${j}c$c"} || '';

            # The FFMA gets stall count 0 when the first line of the inserted
            # text is one of these ops, and 1 otherwise.
            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1;

            # Yield flag only on FFMA 32, and only when it stalls.
            my $yield  = $c == 32 && $stall ? 'Y' : '-';

            # The first FFMA of each step waits on barrier 1, which the last
            # operand load of the previous step (or of the setup) sets.
            my $wait   = $c == 0 ? '01' : '--';

            my $ctrl   = "$wait:-:-:$yield:$stall";

            $out .= sprintf "%s      FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
        }
    }
    return $out;
+]

FILTER_LOOP:
--:-:-:-:1      ISETP.GT.AND P0, PT, C, RZ, PT;
[+
    our ($dtype, $convert_in, $FX, $vsize, $dsize, $K);
    my %insert = (

        j0c1  => "--:-:-:-:1      ISETP.GT.AND P1, PT, C, 4, PT;\n" .
                 "--:-:-:-:1      IADD C, C, -4;\n",

        $FX ? (
            $convert_in ? (
                j1c8  => "02:-:-:-:1  \@P0 $convert_in F03, F01.H1;\n",
                j1c12 => "--:-:-:-:1  \@P0 $convert_in F02, F01.H0;\n",
                j1c16 => "--:-:-:-:1  \@P0 $convert_in F01, F00.H1;\n",
                j1c20 => "--:-:2:-:1  \@P0 $convert_in F00, F00.H0;\n",

                j1c26 => "04:-:-:-:1  \@P0 $convert_in F13, F11.H1;\n",
                j1c30 => "--:-:-:-:1  \@P0 $convert_in F12, F11.H0;\n",
                j1c34 => "--:-:-:-:1  \@P0 $convert_in F11, F10.H1;\n",
                j1c38 => "--:-:3:-:1  \@P0 $convert_in F10, F10.H0;\n",

                j2c8  => "08:-:-:-:1  \@P0 $convert_in F23, F21.H1;\n",
                j2c12 => "--:-:-:-:1  \@P0 $convert_in F22, F21.H0;\n",
                j2c16 => "--:-:-:-:1  \@P0 $convert_in F21, F20.H1;\n",
                j2c20 => "--:-:4:-:1  \@P0 $convert_in F20, F20.H0;\n",

                j2c26 => "10:-:-:-:1  \@P0 $convert_in F33, F31.H1;\n",
                j2c30 => "--:-:-:-:1  \@P0 $convert_in F32, F31.H0;\n",
                j2c34 => "--:-:-:-:1  \@P0 $convert_in F31, F30.H1;\n",
                j2c38 => "--:6:5:-:1  \@P0 $convert_in F30, F30.H0;\n",
            ) : (),

            j1c22 => "02:2:-:-:1  \@P0 STS.128 [writeS + 4x<512*4 + 00*4>], F0;\n",
            j1c24 => "02:-:2:-:1  \@P1 LDG.E.CG.$vsize F0, [track0 + 4x<00 * $dsize>];\n",

            j1c40 => "04:3:-:-:1  \@P0 STS.128 [writeS + 4x<512*4 + 32*4>], F1;\n",
            j1c42 => "04:-:3:-:1  \@P1 LDG.E.CG.$vsize F1, [track0 + 4x<32 * $dsize>];\n",

            j2c22 => "08:4:-:-:1  \@P0 STS.128 [writeS + 4x<512*4 + 64*4>], F2;\n",
            j2c24 => "08:-:4:-:1  \@P1 LDG.E.CG.$vsize F2, [track0 + 4x<64 * $dsize>];\n",

            j2c40 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<512*4 + 96*4>], F3;\n",
            j2c42 => "10:6:5:-:1  \@P1 LDG.E.CG.$vsize F3, [track0 + 4x<96 * $dsize>];\n",

            j3c57 => "20:-:-:-:1  \@P1 IADD   track0.CC, track0, 4x<32*16 * $dsize>;\n",
            j3c62 => "--:-:-:-:1  \@P1 IADD.X track1,    track1, RZ;\n",

        ) : (
            $convert_in ? (
                j0c5  => "02:-:-:-:1      $convert_in f00, f00;\n",
                j0c7  => "--:-:-:-:1      $convert_in f01, f01;\n",
                j0c9  => "--:-:-:-:1      $convert_in f02, f02;\n",

                j0c11 => "--:-:-:-:1      $convert_in f20, f20;\n",
                j0c13 => "--:-:-:-:1      $convert_in f21, f21;\n",
                j0c15 => "--:-:2:-:1      $convert_in f22, f22;\n",

                j0c17 => "--:-:-:-:1      $convert_in f10, f10;\n",
                j0c19 => "--:-:-:-:1      $convert_in f11, f11;\n",
                j0c21 => "--:-:4:-:1      $convert_in f12, f12;\n",
            ) : (),

            j0c33 => "02:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(0*4 + 0)>], F00;\n",
            j0c35 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(0*4 + 3)>], F03;\n",
            j0c37 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(3*4 + 0)>], F30;\n",
            j0c39 => "--:3:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(3*4 + 3)>], F33;\n",

            j0c40 => "--:-:-:-:1  \@P0 FADD tb0, TF00, TF02;\n" .
                     "--:-:-:-:1  \@P0 FADD tb3, TF30, TF32;\n" .
                     "--:-:-:-:1  \@P0 FADD ta0, f00,  f20;\n" .
                     "--:-:-:-:1  \@P0 FADD ta1, f01,  f21;\n" .
                     "--:-:-:-:1  \@P0 FADD ta2, f02,  f22;\n",

            j0c41 => "--:-:-:-:1  \@P0 FMUL tb0, tb0, 0.5;\n" .
                     "--:-:-:-:1  \@P0 FMUL tb3, tb3, 0.5;\n" .
                     "--:-:-:-:1  \@P0 FMUL ta0, ta0, 0.5;\n" .
                     "--:-:-:-:1  \@P0 FMUL ta1, ta1, 0.5;\n" .
                     "--:-:-:-:1  \@P0 FMUL ta2, ta2, 0.5;\n",

            j0c42 => "--:-:-:-:1  \@P0 FFMA F01, TF01,  0.5, tb0;\n" .
                     "--:-:-:-:1  \@P0 FFMA F02, TF01, -0.5, tb0;\n" .
                     "--:-:-:-:1  \@P0 FFMA F31, TF31,  0.5, tb3;\n" .
                     "--:-:-:-:1  \@P0 FFMA F32, TF31, -0.5, tb3;\n",

            j0c45 => "04:-:-:-:1  \@P1 LDG.E.CI.$dtype f00, [track + ${dsize}x<0*3*$K + 0*$K>];\n",
            j0c47 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype f01, [track + ${dsize}x<0*3*$K + 1*$K>];\n",
            j0c49 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype f02, [track + ${dsize}x<0*3*$K + 2*$K>];\n",

            j0c51 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype f20, [track + ${dsize}x<2*3*$K + 0*$K>];\n",
            j0c53 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype f21, [track + ${dsize}x<2*3*$K + 1*$K>];\n",
            j0c55 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype f22, [track + ${dsize}x<2*3*$K + 2*$K>];\n",

            j1c8  => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(0*4 + 1)>], F01;\n",
            j1c10 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(0*4 + 2)>], F02;\n",
            j1c12 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(3*4 + 1)>], F31;\n",
            j1c14 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(3*4 + 2)>], F32;\n",

            j1c15 => "08:-:-:-:1  \@P0 FFMA TF10, f10,  0.5, ta0;\n" .
                     "--:-:-:-:1  \@P0 FFMA TF20, f10, -0.5, ta0;\n" .
                     "--:-:-:-:1  \@P0 FFMA TF11, f11,  0.5, ta1;\n" .
                     "--:-:-:-:1  \@P0 FFMA TF21, f11, -0.5, ta1;\n" .
                     "--:-:-:-:1  \@P0 FFMA TF12, f12,  0.5, ta2;\n" .
                     "--:-:-:-:1  \@P0 FFMA TF22, f12, -0.5, ta2;\n",

            j1c16 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype f10, [track + ${dsize}x<1*3*$K + 0*$K>];\n",
            j1c18 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype f11, [track + ${dsize}x<1*3*$K + 1*$K>];\n",
            j1c20 => "--:6:2:-:1  \@P1 LDG.E.CI.$dtype f12, [track + ${dsize}x<1*3*$K + 2*$K>];\n",

            j1c22 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(1*4 + 0)>], F10;\n",
            j1c24 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(2*4 + 0)>], F20;\n",
            j1c26 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(1*4 + 3)>], F13;\n",
            j1c28 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(2*4 + 3)>], F23;\n",

            j1c29 => "--:-:-:-:1  \@P0 FADD tb1, TF10, TF12;\n" .
                     "--:-:-:-:1  \@P0 FADD tb2, TF20, TF22;\n",

            j1c34 => "--:-:-:-:1  \@P0 FMUL tb1, tb1, 0.5;\n" .
                     "--:-:-:-:1  \@P0 FMUL tb2, tb2, 0.5;\n",

            j1c39 => "--:-:-:-:1  \@P0 FFMA F11, TF11,  0.5, tb1;\n" .
                     "--:-:-:-:1  \@P0 FFMA F12, TF11, -0.5, tb1;\n" .
                     "--:-:-:-:1  \@P0 FFMA F21, TF21,  0.5, tb2;\n" .
                     "--:-:-:-:1  \@P0 FFMA F22, TF21, -0.5, tb2;\n",

            j2c8  => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(1*4 + 1)>], F11;\n",
            j2c10 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(1*4 + 2)>], F12;\n",
            j2c12 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(2*4 + 1)>], F21;\n",
            j2c14 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(2*4 + 2)>], F22;\n",


            j3c57 => "20:-:-:-:1  \@P1 IADD   track0.CC, track0, param_4RSKp;\n",
            j3c62 => "--:-:-:-:1  \@P1 IADD.X track1,    track1, RZ;\n",
        ),

        j2c62 => "--:-:-:-:5      BAR.SYNC 0;\n" .
                 "--:-:-:-:1  \@P0 LOP.XOR readIs, readIs, 4x<512*4*2>;\n" .
                 "--:-:-:-:1  \@P0 LOP.XOR readFs, readFs, 4x<512*4*2>;\n" .
                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<512*4*2>;\n",

        j3c63 => "--:-:-:Y:5  \@P0 BRA.U FILTER_LOOP;\n",
    );
    my @cOrder;
    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
    my @y = (0,1,4,5);
    foreach my $x (0,2,4,6)
    {
        foreach my $y (@y)
        {
            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
        }
        @y = reverse @y;
    }
    my $out;
    foreach my $j (0 .. 3)
    {
        my $odd      = $j & 1;
        my $nOdd     = !$odd + 0;
        my $rsOffset = ($j + 1) % 4;
        my $rsPred   = $j == 3 ? '@P0' : '   ';

        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIx0, [readIs + 4x<%d*512 + 00>];\n", $rsPred, $nOdd, $rsOffset;
        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dFy0, [readFs + 4x<%d*512 + 00>];\n", $rsPred, $nOdd, $rsOffset;
        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIx4, [readIs + 4x<%d*512 + 16>];\n", $rsPred, $nOdd, $rsOffset;
        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dFy4, [readFs + 4x<%d*512 + 16>];\n", $rsPred, $nOdd, $rsOffset;

        foreach my $c (0 .. 63)
        {
            my ($x,$y) = @{$cOrder[$c]};

            my $ins    = $insert{"j${j}c$c"} || '';

            my $stall  = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1;

            my $yield  = $c == 32 && $stall ? 'Y' : '-';

            my $wait   = $c == 0 ? '01' : '--';

            my $ctrl   = "$wait:-:-:$yield:$stall";

            $out .= sprintf "%s      FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
        }
    }
    return $out;
+]

// END_LOOP: epilogue that runs after the main FFMA loop above.
// It (1) derives the output coordinates n, k, p, q for this thread,
// (2) computes the shared-memory staging addresses readFs/readIs/writeCs/readCs,
// (3) builds the bounds predicates P0-P3 and saves them in the preds register,
// and (4) stages the first group of accumulators for OUTPUT_TRANSFORM.
END_LOOP:
// Thread id and the Z / Y block ids; each S2R signals its own barrier (1, 2, 3),
// which the first consumers below wait on via their 01 / 02 / 04 wait masks.
--:-:1:-:1      S2R tid,   SR_TID.X;
--:-:2:-:1      S2R idx_N, SR_CTAID.Z;
--:-:3:-:1      S2R idx_K, SR_CTAID.Y;

<SCHEDULE_BLOCK>
01:-:-:-:1      LOP.AND tid_31, tid, 31;

// Unpack the four fields of idx_nkpq (same immediate format as the 0x201 BFE
// further down: high byte = bit count, low byte = bit position):
//   idx_n = 4 bits at 28, idx_k = 4 bits at 24, idx_P = 12 bits at 12, idx_Q = 12 bits at 0.
--:-:-:-:1      BFE idx_n, idx_nkpq, 0x041c;
--:-:-:-:1      BFE idx_k, idx_nkpq, 0x0418;
--:-:-:-:1      BFE idx_P, idx_nkpq, 0x0c0c;
--:-:-:-:1      BFE idx_Q, idx_nkpq, 0x0c00;

// idx_N = blockIdx.z * param_n + idx_n ; idx_K = blockIdx.y * param_k + idx_k
02:-:-:-:1      XMAD idx_N, idx_N, param_n, idx_n;
04:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;

// Only emitted when the bsum template flag is set:
// bsum_offset = idx_P * param_gridQN + idx_Q * param_gridN + idx_N
[+
    our $bsum; return $bsum ? q{
--:-:-:-:1      XMAD      bsum_offset, idx_Q, param_gridN,   idx_N;
--:-:-:-:1      XMAD.LO2C bsum_offset, idx_P, param_gridQN,  bsum_offset;
    } : '';
+]

// x = grid_x << shiftX
// y = grid_y << shiftY
--:-:-:-:1      SHL p, idx_P, param_shiftP;
--:-:-:-:1      SHL q, idx_Q, param_shiftQ;

// Distribute the 8|4|2|1 blocks of the super block among 4|8|16|32 threads each of the warp
// p = (superP << 1) + p ; q = (superQ << 1) + q
--:-:-:-:1      BFE.U32 superP, tid, param_superP;
--:-:-:-:1      BFE.U32 superQ, tid, param_superQ;
--:-:-:-:1      ISCADD p, superP,  p, 1;
--:-:-:-:1      ISCADD q, superQ,  q, 1;


// n = (idx_N << shiftN) + (tid & superN)
--:-:-:-:1      LOP.AND superN, tid, param_superN;
--:-:-:-:1      SHL  n, idx_N, param_shiftN;
--:-:-:-:1      IADD n, n, superN;

// alpha scales every accumulator before staging; one (1.0f) is used by the
// bprelu path in OUTPUT_TRANSFORM.
--:-:-:-:1      MOV alpha, param_alpha;
--:-:-:-:1      MOV32I one, 1.0;

// readFs = ((tid &  8) >> 2) | (tid & 1)
--:-:-:-:1      LOP.AND  tid_1,  tid,    1;
--:-:-:-:1      LOP.AND  readFs, tid,    8;
--:-:-:-:1      SHR.U32  readFs, readFs, 2;
--:-:-:-:1      LOP.OR   readFs, readFs,  tid_1;
//--:-:-:-:1      SHL      readFs, readFs, 3;

// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3) | (readFs << 2)
// The final SHL by 4 scales the index to 16-byte units.
--:-:-:-:1      LOP.AND  tid_16, tid,   -16;
--:-:-:-:1      SHR.U32  tid_16, tid_16, 1;
--:-:-:-:1      BFE.U32  readIs, tid,    0x201; // 2 bits at position 1
--:-:-:-:1      LOP.OR   readIs, readIs, tid_16;
--:-:-:-:1      ISCADD   readIs, readFs, readIs, 2;
--:-:-:-:1      SHL      readIs, readIs, 4;

// writeCs = readFs * 512 + readIs;
// NOTE(review): the instruction computes (readFs << 12) + readIs on the already
// scaled readIs; confirm which unit the "512" above is expressed in.
--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 12;

// readCs = tid32 * 512 + tid_31 + tid64 * 16   (then << 2 to convert to bytes)
// where tid32 = tid >> 5 and tid64 = tid >> 6.
--:-:-:-:1      SHR.U32 tid32, tid,  5;
--:-:-:-:1      SHR.U32 tid64, tid,  6;
--:-:-:-:1      ISCADD  readCs, tid32, tid_31, 9;
--:-:-:-:1      ISCADD  readCs, tid64, readCs, 4;
--:-:-:-:1      SHL     readCs, readCs, 2;

// k = idx_K*32 + tid32<<1
--:-:-:-:1      SHL tid32, tid32, 1;
--:-:-:-:1      ISCADD  k, idx_K, tid32, 5;

// Out00 = k*PQN + p*QN + q*N + n
// Out01 = Out00 + N
// Out10 = Out00 + QN
// Out11 = Out01 + QN
--:-:-:-:1      XMAD      out_offset, q, 1x<$N>,    n;
--:-:-:-:1      XMAD.LO2C out_offset, p, param_QN,  out_offset;
--:-:-:-:1      XMAD.LO2C out_offset, k, param_PQN, out_offset;


// PQN15 = 16*PQN - PQN = 15*PQN; added to out_offset when k jumps by 15
// between the second and third staging passes.
--:-:-:-:1      MOV  PQN15, param_PQN;
--:-:-:-:1      SHL  PQN15, PQN15, 4;
--:-:-:-:1      IADD PQN15, PQN15, -param_PQN;

// Second row / column of the 2x2 output tile.
--:-:-:-:1      IADD q2, q, 1;
--:-:-:-:1      IADD p2, p, 1;


// P6 = (param_flags == 0) && (n < N). P2..P5 hold the raw p/q bounds checks;
// the n < N / flags condition is folded in by the PSETP instructions below.
--:-:-:-:1      ISETP.EQ.AND P6, PT, RZ, param_flags, PT; // ! no-op
--:-:-:-:1      ISETP.LT.AND P6, PT, n,  1x<$N>,  P6; // n < N
--:-:-:-:1      ISETP.LT.AND P2, PT, p,  param_P, PT; // p0 < P
--:-:-:-:1      ISETP.LT.AND P3, PT, q,  1x<$Q>,  PT; // q0 < Q
--:-:-:-:1      ISETP.LT.AND P4, PT, p2, param_P, PT; // p1 < P
--:-:-:-:1      ISETP.LT.AND P5, PT, q2, 1x<$Q>,  PT; // q1 < Q

// Final per-output predicates (each also requires P6), saved into preds so
// OUTPUT_TRANSFORM can restore them with R2P after clobbering P0-P3.
--:-:-:-:1      PSETP.AND.AND P0, PT, P2, P3, P6; // p0 && q0
--:-:-:-:1      PSETP.AND.AND P1, PT, P2, P5, P6; // p0 && q1
--:-:-:-:1      PSETP.AND.AND P2, PT, P4, P3, P6; // p1 && q0
--:-:-:-:1      PSETP.AND.AND P3, PT, P4, P5, P6; // p1 && q1
--:-:-:-:1      P2R preds, PR, RZ, 0x0f;

// P6 is re-purposed from here on: true only for lane 0 of each warp.
--:-:-:-:1      ISETP.EQ.AND P6, PT, tid_31, RZ, PT; // tid31 == 0
</SCHEDULE_BLOCK>

// Staging pass 1 of 4: scale the y0 and y2 accumulator columns by alpha and
// write them to shared memory at writeCs for OUTPUT_TRANSFORM to read back.
// Each STS.128 stores a 128-bit vector starting at the named register
// (presumably x0..x3 / x4..x7 are register-contiguous; the register map is
// outside this view).
<SCHEDULE_BLOCK>
--:-:-:-:1      FMUL shuffle_x0y0, cx0y0, alpha;
--:-:-:-:1      FMUL shuffle_x1y0, cx1y0, alpha;
--:-:-:-:1      FMUL shuffle_x2y0, cx2y0, alpha;
--:-:-:-:1      FMUL shuffle_x3y0, cx3y0, alpha;
--:-:-:-:1      FMUL shuffle_x4y0, cx4y0, alpha;
--:-:-:-:1      FMUL shuffle_x5y0, cx5y0, alpha;
--:-:-:-:1      FMUL shuffle_x6y0, cx6y0, alpha;
--:-:-:-:1      FMUL shuffle_x7y0, cx7y0, alpha;
--:-:-:-:1      FMUL shuffle_x0y1, cx0y2, alpha;
--:-:-:-:1      FMUL shuffle_x1y1, cx1y2, alpha;
--:-:-:-:1      FMUL shuffle_x2y1, cx2y2, alpha;
--:-:-:-:1      FMUL shuffle_x3y1, cx3y2, alpha;
--:-:-:-:1      FMUL shuffle_x4y1, cx4y2, alpha;
--:-:-:-:1      FMUL shuffle_x5y1, cx5y2, alpha;
--:-:-:-:1      FMUL shuffle_x6y1, cx6y2, alpha;
--:-:-:-:1      FMUL shuffle_x7y1, cx7y2, alpha;

--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0;
--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0;
--:-:-:-:1      STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1;
--:-:-:-:d      STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1;
</SCHEDULE_BLOCK>
// Barrier so every thread's staged values are visible before they are read.
--:-:-:-:5      BAR.SYNC 0;

// Transform and store this pass, then advance to the next k (one PQN further
// in the output). The trailing barrier keeps the next pass from overwriting
// the staging area while other threads may still be reading it.
--:-:-:-:5      CAL OUTPUT_TRANSFORM;
--:-:-:-:1      IADD k, k, 1;
--:-:-:-:0      IADD out_offset, out_offset, param_PQN;
--:-:-:-:5      BAR.SYNC 0;

// Staging pass 2 of 4: accumulator columns y1 and y3, scaled by alpha.
// This pass is scheduled by hand (it is not inside a schedule block): each
// STS.128 is issued as soon as the four registers it stores have been
// produced, interleaved with the FMULs for the next vector.
--:-:-:-:1      FMUL shuffle_x0y0, cx0y1, alpha;
--:-:-:-:1      FMUL shuffle_x1y0, cx1y1, alpha;
--:-:-:-:1      FMUL shuffle_x2y0, cx2y1, alpha;
--:-:-:-:1      FMUL shuffle_x3y0, cx3y1, alpha;
--:-:-:-:1      FMUL shuffle_x4y0, cx4y1, alpha;
--:-:-:-:1      FMUL shuffle_x5y0, cx5y1, alpha;
--:-:-:-:1      FMUL shuffle_x6y0, cx6y1, alpha;
--:-:-:-:0      FMUL shuffle_x7y0, cx7y1, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0;
--:-:-:-:1      FMUL shuffle_x0y1, cx0y3, alpha;
--:-:-:-:1      FMUL shuffle_x1y1, cx1y3, alpha;
--:-:-:-:1      FMUL shuffle_x2y1, cx2y3, alpha;
--:-:-:-:0      FMUL shuffle_x3y1, cx3y3, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0;
--:-:-:-:1      FMUL shuffle_x4y1, cx4y3, alpha;
--:-:-:-:1      FMUL shuffle_x5y1, cx5y3, alpha;
--:-:-:-:1      FMUL shuffle_x6y1, cx6y3, alpha;
--:-:-:-:0      FMUL shuffle_x7y1, cx7y3, alpha;
--:-:-:-:4      STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1;
--:-:-:-:d      STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1;
// Make the staged values visible to all threads before the transform reads them.
--:-:-:-:5      BAR.SYNC 0;

// After this pass k jumps by 15 and the output offset by PQN15, which was
// computed as 16*PQN - PQN in the setup code after END_LOOP.
--:-:-:-:5      CAL OUTPUT_TRANSFORM;
--:-:-:-:1      IADD k, k, 15;
--:-:-:-:0      IADD out_offset, out_offset, PQN15;
--:-:-:-:5      BAR.SYNC 0;

// Staging pass 3 of 4: accumulator columns y4 and y6, scaled by alpha.
// Same hand-interleaved FMUL / STS.128 pattern as the previous pass.
--:-:-:-:1      FMUL shuffle_x0y0, cx0y4, alpha;
--:-:-:-:1      FMUL shuffle_x1y0, cx1y4, alpha;
--:-:-:-:1      FMUL shuffle_x2y0, cx2y4, alpha;
--:-:-:-:1      FMUL shuffle_x3y0, cx3y4, alpha;
--:-:-:-:1      FMUL shuffle_x4y0, cx4y4, alpha;
--:-:-:-:1      FMUL shuffle_x5y0, cx5y4, alpha;
--:-:-:-:1      FMUL shuffle_x6y0, cx6y4, alpha;
--:-:-:-:0      FMUL shuffle_x7y0, cx7y4, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0;
--:-:-:-:1      FMUL shuffle_x0y1, cx0y6, alpha;
--:-:-:-:1      FMUL shuffle_x1y1, cx1y6, alpha;
--:-:-:-:1      FMUL shuffle_x2y1, cx2y6, alpha;
--:-:-:-:0      FMUL shuffle_x3y1, cx3y6, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0;
--:-:-:-:1      FMUL shuffle_x4y1, cx4y6, alpha;
--:-:-:-:1      FMUL shuffle_x5y1, cx5y6, alpha;
--:-:-:-:1      FMUL shuffle_x6y1, cx6y6, alpha;
--:-:-:-:0      FMUL shuffle_x7y1, cx7y6, alpha;
--:-:-:-:4      STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1;
--:-:-:-:d      STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1;
// Make the staged values visible to all threads before the transform reads them.
--:-:-:-:5      BAR.SYNC 0;

// Transform and store, then step to the next k / next PQN slice of the output.
--:-:-:-:5      CAL OUTPUT_TRANSFORM;
--:-:-:-:1      IADD k, k, 1;
--:-:-:-:0      IADD out_offset, out_offset, param_PQN;
--:-:-:-:5      BAR.SYNC 0;

// Staging pass 4 of 4 (last): accumulator columns y5 and y7, scaled by alpha.
// Same hand-interleaved FMUL / STS.128 pattern as the previous passes.
--:-:-:-:1      FMUL shuffle_x0y0, cx0y5, alpha;
--:-:-:-:1      FMUL shuffle_x1y0, cx1y5, alpha;
--:-:-:-:1      FMUL shuffle_x2y0, cx2y5, alpha;
--:-:-:-:1      FMUL shuffle_x3y0, cx3y5, alpha;
--:-:-:-:1      FMUL shuffle_x4y0, cx4y5, alpha;
--:-:-:-:1      FMUL shuffle_x5y0, cx5y5, alpha;
--:-:-:-:1      FMUL shuffle_x6y0, cx6y5, alpha;
--:-:-:-:0      FMUL shuffle_x7y0, cx7y5, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0;
--:-:-:-:1      FMUL shuffle_x0y1, cx0y7, alpha;
--:-:-:-:1      FMUL shuffle_x1y1, cx1y7, alpha;
--:-:-:-:1      FMUL shuffle_x2y1, cx2y7, alpha;
--:-:-:-:0      FMUL shuffle_x3y1, cx3y7, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0;
--:-:-:-:1      FMUL shuffle_x4y1, cx4y7, alpha;
--:-:-:-:1      FMUL shuffle_x5y1, cx5y7, alpha;
--:-:-:-:1      FMUL shuffle_x6y1, cx6y7, alpha;
--:-:-:-:0      FMUL shuffle_x7y1, cx7y7, alpha;
--:-:-:-:4      STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1;
--:-:-:-:d      STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1;
// Make the staged values visible to all threads before the transform reads them.
--:-:-:-:5      BAR.SYNC 0;

// Final transform; no k / out_offset update is needed afterwards.
--:-:-:-:5      CAL OUTPUT_TRANSFORM;

// All four passes are written out: the kernel ends here.
--:-:-:-:5      EXIT;

// OUTPUT_TRANSFORM: subroutine called (CAL) once per staging pass.
// Inputs:  k, out_offset, preds, readCs, P6 (lane 0 flag) and the values the
//          caller staged in shared memory.
// Steps:   1. P4 = k < K; restore P0-P3 from preds, or clear them if k is out of range.
//          2. Optionally start loading the extra operand b (from param_X or param_S).
//          3. Read a 4x4 tile m00..m33 from shared memory and reduce it to the
//             2x2 result s00, s01, s10, s11.
//          4. Apply the optional template-selected epilogues
//             (bias, relu, prelu, beta, brelu, bprelu, bsum).
//          5. Optionally convert, then store the four results to param_O.
// Which optional sections exist is decided at template-expansion time by the
// Perl flags read inside each code section.
OUTPUT_TRANSFORM:


<SCHEDULE_BLOCK>
// The 11 wait mask stalls on barriers 1 and 5 before P4 is recomputed.
11:-:-:-:1      ISETP.LT.AND P4, PT, k, 1x<$K>, PT; // k < K
--:-:-:-:1  @P4 R2P PR, preds, 0x0f;
--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f;
// When beta, brelu or bprelu is enabled: form the address param_X + out_offset
// (scaled by the element-size shift) and load the four matching elements
// b00..b11, zeroing each one whose predicate is false.
[+
    our ($beta, $brelu, $bprelu, $dsize, $dshift, $dtype, $Q, $N);
    return $beta || $brelu || $bprelu ? qq{
--:-:-:-:1      LEA      Out0.CC, out_offset, param_X[0],     $dshift;
--:-:-:-:1      LEA.HI.X Out1,    out_offset, param_X[1], RZ, $dshift;

--:-:-:-:1 \@!P0 MOV b00, RZ;
--:-:-:-:1 \@!P1 MOV b01, RZ;
--:-:-:-:1 \@!P2 MOV b10, RZ;
--:-:-:-:1 \@!P3 MOV b11, RZ;
<ORDERED>
--:-:-:-:1  \@P0 LDG.E.CI.$dtype b00, [Out + ${dsize}x<0*$Q*$N + 0*$N>];
--:-:5:-:1  \@P1 LDG.E.CI.$dtype b01, [Out + ${dsize}x<0*$Q*$N + 1*$N>];
--:-:-:-:1  \@P2 LDG.E.CI.$dtype b10, [Out + ${dsize}x<1*$Q*$N + 0*$N>];
--:-:6:-:1  \@P3 LDG.E.CI.$dtype b11, [Out + ${dsize}x<1*$Q*$N + 1*$N>];
</ORDERED>
    } : '';
+]
// When bias is enabled: load one 4-byte value for this k from param_S into b00
// (zero when k is out of range).
[+
    our $bias; return $bias ? q{
// sum = S + k
20:-:-:-:1      LEA      Sum0.CC, k, param_S[0],     2;
--:-:-:-:1      LEA.HI.X Sum1,    k, param_S[1], RZ, 2;

--:-:-:-:1 @!P4 MOV b00, RZ;
--:-:5:-:1  @P4 LDG.E.CI b00, [Sum];
    } : '';
+]
</SCHEDULE_BLOCK>

// Read the staged 4x4 tile: 16 values spaced 32 words apart starting at readCs.
// Each row of four loads signals its own barrier (1..4) so the row sums below
// can start as soon as their row has arrived.
--:-:-:-:1      LDS m00, [readCs + 4x< 0*32>];
--:-:-:-:1      LDS m01, [readCs + 4x< 1*32>];
--:-:-:-:1      LDS m02, [readCs + 4x< 2*32>];
--:-:1:Y:1      LDS m03, [readCs + 4x< 3*32>];

--:-:-:-:1      LDS m10, [readCs + 4x< 4*32>];
--:-:-:-:1      LDS m11, [readCs + 4x< 5*32>];
--:-:-:-:1      LDS m12, [readCs + 4x< 6*32>];
--:-:2:Y:1      LDS m13, [readCs + 4x< 7*32>];

--:-:-:-:1      LDS m20, [readCs + 4x< 8*32>];
--:-:-:-:1      LDS m21, [readCs + 4x< 9*32>];
--:-:-:-:1      LDS m22, [readCs + 4x<10*32>];
--:-:3:Y:1      LDS m23, [readCs + 4x<11*32>];

--:-:-:-:1      LDS m30, [readCs + 4x<12*32>];
--:-:-:-:1      LDS m31, [readCs + 4x<13*32>];
--:-:-:-:1      LDS m32, [readCs + 4x<14*32>];
--:-:4:Y:1      LDS m33, [readCs + 4x<15*32>];

// Reduce the 4x4 tile to 2x2: first along each row (m -> t), then down the
// columns (t -> s). The 01/02/04/08 wait masks match the LDS barriers above.
<SCHEDULE_BLOCK>
// t00 = m00+m01+m02;
// t01 = m01-m02-m03;
01:-:-:-:1      FADD t00, m00,  m01;
--:-:-:-:1      FADD t00, t00,  m02;
--:-:-:-:1      FADD t01, m01, -m02;
--:-:-:-:1      FADD t01, t01, -m03;
// t10 = m10+m11+m12;
// t11 = m11-m12-m13;
02:-:-:-:1      FADD t10, m10,  m11;
--:-:-:-:1      FADD t10, t10,  m12;
--:-:-:-:1      FADD t11, m11, -m12;
--:-:-:-:1      FADD t11, t11, -m13;
// t20 = m20+m21+m22;
// t21 = m21-m22-m23;
04:-:-:-:1      FADD t20, m20,  m21;
--:-:-:-:1      FADD t20, t20,  m22;
--:-:-:-:1      FADD t21, m21, -m22;
--:-:-:-:1      FADD t21, t21, -m23;
// t30 = m30+m31+m32;
// t31 = m31-m32-m33;
08:-:-:-:1      FADD t30, m30,  m31;
--:-:-:-:1      FADD t30, t30,  m32;
--:-:-:-:1      FADD t31, m31, -m32;
--:-:-:-:1      FADD t31, t31, -m33;
// s00 = t00+t10+t20;
// s01 = t01+t11+t21;
--:-:-:-:1      FADD s00, t00,  t10;
--:-:-:-:1      FADD s00, s00,  t20;
--:-:-:-:1      FADD s01, t01,  t11;
--:-:-:-:1      FADD s01, s01,  t21;
// s10 = t10-t20-t30;
// s11 = t11-t21-t31;
--:-:-:-:1      FADD s10, t10, -t20;
--:-:-:-:1      FADD s10, s10, -t30;
--:-:-:-:1      FADD s11, t11, -t21;
--:-:-:-:3      FADD s11, s11, -t31;

// bias: add the single per-k value b00 to all four results (waits on barrier 5,
// which the bias load signals).
[+
    our $bias; return $bias ? q{
10:-:-:-:1  @P0 FADD s00, s00, b00;
--:-:-:-:1  @P1 FADD s01, s01, b00;
--:-:-:-:1  @P2 FADD s10, s10, b00;
--:-:-:-:1  @P3 FADD s11, s11, b00;
    } : '';
+]
// relu: s = max(s, 0) only. The formula comment emitted inside this section
// also mentions a slope term, but no slope is applied here; that term belongs
// to the prelu section that follows.
[+
    our $relu; return $relu ? q{
// maximum(x, 0) + slope * minimum(0, x)
--:-:-:-:1  @P0 FMNMX s00, s00, RZ, !PT;
--:-:-:-:1  @P1 FMNMX s01, s01, RZ, !PT;
--:-:-:-:1  @P2 FMNMX s10, s10, RZ, !PT;
--:-:-:-:1  @P3 FMNMX s11, s11, RZ, !PT;
    } : '';
+]
// prelu: b = max(s, 0), x = min(s, 0), s = x * param_beta + b
// (param_beta carries the slope in this mode).
[+
    our $prelu; return $prelu ? q{
// maximum(x, 0) + slope * minimum(0, x)
--:-:-:-:1  @P0 FMNMX b00, s00, RZ, !PT;
--:-:-:-:1  @P1 FMNMX b01, s01, RZ, !PT;
--:-:-:-:1  @P2 FMNMX b10, s10, RZ, !PT;
--:-:-:-:1  @P3 FMNMX b11, s11, RZ, !PT;

--:-:-:-:1  @P0 FMNMX x00, s00, RZ, PT;
--:-:-:-:1  @P1 FMNMX x01, s01, RZ, PT;
--:-:-:-:1  @P2 FMNMX x10, s10, RZ, PT;
--:-:-:-:1  @P3 FMNMX x11, s11, RZ, PT;

--:-:-:-:1  @P0 FFMA s00, x00, param_beta, b00;
--:-:-:-:1  @P1 FFMA s01, x01, param_beta, b01;
--:-:-:-:1  @P2 FFMA s10, x10, param_beta, b10;
--:-:-:-:1  @P3 FFMA s11, x11, param_beta, b11;
    } : '';
+]
</SCHEDULE_BLOCK>

// Epilogues that consume the b00..b11 values loaded from param_X.
<SCHEDULE_BLOCK>
// Half-precision builds only: widen the loaded b values to fp32 first.
[+
    our ($beta, $brelu, $bprelu, $convert_in);
    return $convert_in && ($beta || $brelu || $bprelu) ? qq{
10:-:1:-:1  \@P0 $convert_in b00, b00;
--:-:2:-:1  \@P1 $convert_in b01, b01;
20:-:3:-:1  \@P2 $convert_in b10, b10;
--:-:4:-:1  \@P3 $convert_in b11, b11;
    } : '';
+]
// beta: s = b * param_beta + s
[+
    our $beta; return $beta ? q{
11:-:-:-:1  @P0 FFMA s00, b00, param_beta, s00;
02:-:-:-:1  @P1 FFMA s01, b01, param_beta, s01;
24:-:-:-:1  @P2 FFMA s10, b10, param_beta, s10;
08:-:-:-:1  @P3 FFMA s11, b11, param_beta, s11;
    } : '';
+]
// brelu: zero s wherever b is not greater than 0. P0-P3 are used as scratch
// for the comparison and then restored from preds (or cleared when k >= K).
[+
    our $brelu; return $brelu ? q{
//delta *= x > 0
11:-:-:-:1      FSETP.GT.AND P0, PT, b00, RZ, PT;
02:-:-:-:1      FSETP.GT.AND P1, PT, b01, RZ, PT;
24:-:-:-:1      FSETP.GT.AND P2, PT, b10, RZ, PT;
08:-:-:-:1      FSETP.GT.AND P3, PT, b11, RZ, PT;
--:-:-:-:1 @!P0 MOV s00, RZ;
--:-:-:-:1 @!P1 MOV s01, RZ;
--:-:-:-:1 @!P2 MOV s10, RZ;
--:-:-:-:1 @!P3 MOV s11, RZ;
--:-:-:-:1  @P4 R2P PR, preds, 0x0f;
--:-:-:-:5 @!P4 R2P PR, RZ, 0x0f;
    } : '';
+]
// bprelu: s *= (b > 0 ? 1 : 0) + param_beta * (b < 0 ? 1 : 0). P0-P3 are again
// used as scratch and restored from preds afterwards.
[+
    our $bprelu; return $bprelu ? q{
//delta *= ((x > 0) + slope * (x < 0))
11:-:-:-:1      FSETP.GT.AND P0, PT, b00, RZ, PT;
02:-:-:-:1      FSETP.GT.AND P1, PT, b01, RZ, PT;
24:-:-:-:1      FSETP.GT.AND P2, PT, b10, RZ, PT;
08:-:-:-:1      FSETP.GT.AND P3, PT, b11, RZ, PT;
--:-:-:-:1      SEL x00, one, RZ, P0;
--:-:-:-:1      SEL x01, one, RZ, P1;
--:-:-:-:1      SEL x10, one, RZ, P2;
--:-:-:-:1      SEL x11, one, RZ, P3;
--:-:-:-:1      FSETP.LT.AND P0, PT, b00, RZ, PT;
--:-:-:-:1      FSETP.LT.AND P1, PT, b01, RZ, PT;
--:-:-:-:1      FSETP.LT.AND P2, PT, b10, RZ, PT;
--:-:-:-:1      FSETP.LT.AND P3, PT, b11, RZ, PT;
--:-:-:-:1      SEL b00, one, RZ, P0;
--:-:-:-:1      SEL b01, one, RZ, P1;
--:-:-:-:1      SEL b10, one, RZ, P2;
--:-:-:-:1      SEL b11, one, RZ, P3;
--:-:-:-:1  @P4 R2P PR, preds, 0x0f;
--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f;
--:-:-:-:1      FFMA b00, b00, param_beta, x00;
--:-:-:-:1      FFMA b01, b01, param_beta, x01;
--:-:-:-:1      FFMA b10, b10, param_beta, x10;
--:-:-:-:1      FFMA b11, b11, param_beta, x11;
--:-:-:-:1      FMUL s00, s00, b00;
--:-:-:-:1      FMUL s01, s01, b01;
--:-:-:-:1      FMUL s10, s10, b10;
--:-:-:-:1      FMUL s11, s11, b11;
    } : '';
+]
// bsum: per-thread partial sum of the (in-bounds) results, reduced across the
// warp further below.
[+
    our $bsum; return $bsum ? q{
--:-:-:-:1      MOV sum0, RZ;
--:-:-:-:1  @P0 FADD sum0, s00, sum0;
--:-:-:-:1  @P1 FADD sum0, s01, sum0;
--:-:-:-:1  @P2 FADD sum0, s10, sum0;
--:-:-:-:1  @P3 FADD sum0, s11, sum0;
    } : '';
+]
</SCHEDULE_BLOCK>

// Half-precision builds only: narrow the results before storing. Each convert
// signals a barrier (1..4) that the matching store below waits on.
[+
    our $convert_out;
    return $convert_out ? qq{
--:-:1:-:1      $convert_out s00, s00;
--:-:2:-:1      $convert_out s01, s01;
--:-:3:-:1      $convert_out s10, s10;
--:-:4:-:1      $convert_out s11, s11;
    } : '';
+]


// Store the 2x2 result: Out = param_O + out_offset scaled by the element-size
// shift; the four stores are N and Q*N elements apart and individually guarded
// by P0-P3.
<SCHEDULE_BLOCK>
--:-:-:-:1      LEA      Out0.CC, out_offset, param_O[0],     [+ dshift() +];
--:-:-:-:1      LEA.HI.X Out1,    out_offset, param_O[1], RZ, [+ dshift() +];

// k < K && R2P && output
01:-:-:-:1  @P0 STG.E.CG.[+ dtype() +] [Out + [+ dsize() +]x<0*$Q*$N + 0*$N>], s00;
02:-:-:-:1  @P1 STG.E.CG.[+ dtype() +] [Out + [+ dsize() +]x<0*$Q*$N + 1*$N>], s01;
04:-:-:-:1  @P2 STG.E.CG.[+ dtype() +] [Out + [+ dsize() +]x<1*$Q*$N + 0*$N>], s10;
08:1:-:-:1  @P3 STG.E.CG.[+ dtype() +] [Out + [+ dsize() +]x<1*$Q*$N + 1*$N>], s11;
</SCHEDULE_BLOCK>

// bsum: butterfly-shuffle reduction of sum0 over the 32 lanes of the warp
// (xor distances 1, 2, 4, 8, 16). Lane 0 (P6) then writes the warp total to
// param_S at index k * param_gridPQN + bsum_offset, provided k < K (P4).
[+
    our $bsum;
    return $bsum ? q{
<SCHEDULE_BLOCK>
--:-:-:-:1      XMAD.LO2C b00, k, param_gridPQN, bsum_offset;

--:-:-:-:1      LEA      Sum0.CC, b00, param_S[0],     2;
--:-:-:-:1      LEA.HI.X Sum1,    b00, param_S[1], RZ, 2;

--:-:-:-:1      PSETP.AND.AND P5, PT, P4, P6, PT; // k < K && tid31 == 0

--:-:5:-:2      SHFL.BFLY PT, sum1, sum0,  1, 0x1f;
10:-:-:-:4      FADD sum0, sum1, sum0;
--:-:5:-:2      SHFL.BFLY PT, sum1, sum0,  2, 0x1f;
10:-:-:-:4      FADD sum0, sum1, sum0;
--:-:5:-:2      SHFL.BFLY PT, sum1, sum0,  4, 0x1f;
10:-:-:-:4      FADD sum0, sum1, sum0;
--:-:5:-:2      SHFL.BFLY PT, sum1, sum0,  8, 0x1f;
10:-:-:-:4      FADD sum0, sum1, sum0;
--:-:5:-:2      SHFL.BFLY PT, sum1, sum0, 16, 0x1f;
10:-:-:-:2      FADD sum0, sum1, sum0;

--:5:-:-:1  @P5 STG.E.CG [Sum], sum0;
</SCHEDULE_BLOCK>
    } : '';
+]

// Return to the caller in the END_LOOP epilogue.
--:-:-:-:5      RET;



//     T0 = np.empty((4,4))
//     T1 = np.empty((4,4))
//
//     for O, I in ((T0, I), (T1, T0.T)):
//
//         O[0,:] = I[0,:] - I[2,:]
//         O[1,:] = I[1,:] + I[2,:]
//         O[2,:] = I[2,:] - I[1,:]
//         O[3,:] = I[1,:] - I[3,:]
//
//     Iw[:] = T1.T
//
// 0  = i00
// 1  = i01
// 2  = i02
// 3  = i03
// 4  = i30
// 5  = i31
// 6  = i32
// 7  = i33
// 8  = i13
// 9  = i12
// 10 = i11
// 11 = i10
// 12 = i23, TI23, I23
// 13 = i22, TI22
// 14 = i21, TI21
// 15 = i20, TI20, I20
// 16 = TI00, I00, TI10, I10, I21, I01
// 17 = TI01, I11
// 18 = TI02, I12
// 19 = TI03, I03, TI11, I31
// 20 = TI30, I30, TI12, I32
// 21 = TI31
// 22 = TI32
// 23 = TI33, I33, TI13, I13, I22, I02
//
//
// TI00 = i00 - i20
// TI01 = i01 - i21
// TI02 = i02 - i22
// TI03 = i03 - i23
// # load 0
//
// TI30 = i10 - i30
// TI31 = i11 - i31
// TI32 = i12 - i32
// TI33 = i13 - i33
// # load 3
//
// I00 = TI00 - TI02
// I03 = TI01 - TI03
// I30 = TI30 - TI32
// I33 = TI31 - TI33
// # store 0
//
// # wait 0
// TI10 = i10 + i20
// TI11 = i11 + i21
// TI12 = i12 + i22
// TI13 = i13 + i23
//
// TI20 = i20 - i10
// TI21 = i21 - i11
// TI22 = i22 - i12
// TI23 = i23 - i13
//
// #load 1
//
// I10 = TI10 - TI12
// I20 = TI20 - TI22
// I13 = TI11 - TI13
// I23 = TI21 - TI23
// # store 1
//
// # wait 1
// I21 = TI21 + TI22
// I22 = TI22 - TI21
// # store 2
//
// # load 2
//
// # wait 2
// I01 = TI01 + TI02
// I02 = TI02 - TI01
// I11 = TI11 + TI12
// I12 = TI12 - TI11
// I31 = TI31 + TI32
// I32 = TI32 - TI31
// #store 3



//     T0 = np.empty((4,3))
//     T1 = np.empty((4,4))
//
//     for O, I in ((T0, F), (T1, T0.T)):
//
//         t0 = (I[0,:] + I[2,:])*0.5
//
//         O[0,:] = I[0,:]
//         O[1,:] = t0 + I[1,:]*0.5
//         O[2,:] = t0 - I[1,:]*0.5
//         O[3,:] = I[2,:]
//
//     Fw[:] = T1.T
//
// 0  = f00, TF00, F00
// 1  = f01, TF01
// 2  = f02, TF02, F03
// 3  = f10
// 4  = f11
// 5  = f12
// 6  = f20, TF30, F30
// 7  = f21, TF31
// 8  = f22, TF32, F33
// 9  = tb3, F32
// 10 = tb0, F02
// 11 = ta2, TF22, F23
// 12 = ta0, TF20, F20
// 13 = ta1, TF21
// 14 = F01
// 15 = F31
// 16 = TF10, F10
// 17 = TF11
// 18 = TF12, F13
// 19 = tb1, F12
// 20 = tb2, F22
// 21 = F11
// 22 = F21
// 23 =
//
//
// TF00 = f00
// TF01 = f01
// TF02 = f02
// TF30 = f20
// TF31 = f21
// TF32 = f22
//
// F00 = TF00
// F03 = TF02
// F30 = TF30
// F33 = TF32
//
// # store 0
//
// tb0 = TF00 + TF02
// tb3 = TF30 + TF32
// ta0 = f00 + f20
// ta1 = f01 + f21
// ta2 = f02 + f22
//
// tb0 = tb0 * 0.5
// tb3 = tb3 * 0.5
// ta0 = ta0 * 0.5
// ta1 = ta1 * 0.5
// ta2 = ta2 * 0.5
//
// F01 = tb0 + TF01*0.5
// F02 = tb0 - TF01*0.5
// F31 = tb3 + TF31*0.5
// F32 = tb3 - TF31*0.5
//
// # wait 0
// # load 0, 2
// # store 1
//
// TF10 = ta0 + f10*0.5
// TF20 = ta0 - f10*0.5
// TF11 = ta1 + f11*0.5
// TF21 = ta1 - f11*0.5
// TF12 = ta2 + f12*0.5
// TF22 = ta2 - f12*0.5
//
// # load 1
//
// F10 = TF10
// F20 = TF20
// F13 = TF12
// F23 = TF22
//
// # store 2
//
// tb1 = TF10 + TF12
// tb2 = TF20 + TF22
// tb1 = tb1 * 0.5
// tb2 = tb2 * 0.5
//
// F11 = tb1 + TF11*0.5
// F12 = tb1 - TF11*0.5
// F21 = tb2 + TF21*0.5
// F22 = tb2 - TF21*0.5
//
// # store 3